*** How worried should we be? The implications of fabricated survey data for political science
*** Table A11. Item-Level Effects of Fabricated Data using Reduced Clean Data

set more off

* set directory to location of dataset in following line
cd "C:\~\Downloads\"

use "VEN_fraud_data.dta", clear

* Collapse the two versions of env2b and drk1 into one variable each.
* A value of 999999 in one version means the other version is used
* (presumably 999999 marks the version that was not administered -- confirm
* against the codebook). Version 2 takes priority when version 1 is 999999;
* rows where neither version is 999999 are left missing.
foreach stem in env2b drk1 {
	generate `stem' = cond(`stem'1 == 999999, `stem'2, cond(`stem'2 == 999999, `stem'1, .))
}

** label data for full sample comparisons
* first drop canceled cases (clean_data == 0) that are not in the matched set
drop if clean_data == 0 & cem_matched != 1
* 1 = fake, 2 = clean matched, 3 = rest of data
gen comparison_groups = 1 if likelyfraud == 1 & cem_matched == 1
replace comparison_groups = 2 if likelyfraud == 0 & cem_matched == 1
replace comparison_groups = 3 if likelyfraud == 0 & cem_matched == 0

* double the "rest of the dataset" (non-matched clean interviews)
* The variable name is written out in full below; the abbreviated form
* only resolves while variable abbreviation is switched on and unambiguous.
gen exp=1
replace exp=2 if comparison_groups == 3
* copy == 0 marks original rows, copy == 1 marks the duplicates added by expand
expand exp, gen(copy)

* compromised versus clean indicator (1 = clean, 2 = compromised)
* Order matters: every original row is first set to 1, then group 1 rows
* and all duplicated rows are overwritten with 2.
gen clean_or_compr = .
replace clean_or_compr = 1 if comparison_groups == 2 | copy == 0
replace clean_or_compr = 2 if comparison_groups == 1 | copy == 1

* 0/1 version of the same indicator with value labels
gen clean = 1 if clean_or_compr == 1
replace clean = 0 if clean_or_compr == 2
lab define clean_comp_lab 0 "Compromised" 1 "Clean"
lab values clean clean_comp_lab
* remove rows with clean_data == 0 from the side coded 2
* (presumably this yields the "reduced clean data" of the table title -- confirm)
drop if clean_or_compr == 2 & clean_data == 0

*scale items analysis
* non-response
* For every item, build an indicator that is 1 for the codes 888888, 988888
* and 99 (presumably don't-know / no-answer codes -- confirm against the
* codebook), missing for 999999, and 0 otherwise, then run a two-group test
* of proportions by clean_or_compr and post the group proportions, their
* difference and the z statistic.
local nr_items l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
	ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
	d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
	ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
	lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
	sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2

postfile qdata str32 question nonresponse_clean nonresponse_fraud nr_diff diff_nr_zstat ///
	using "questions_full_nr.dta", replace
foreach item of varlist `nr_items' {
	display "`item'"
	* single-expression form: 999999 -> missing, listed codes -> 1, anything else -> 0
	generate `item'_nr = cond(`item' == 999999, ., inlist(`item', 888888, 988888, 99))
	prtest `item'_nr, by(clean_or_compr)
	post qdata ("`item'") (r(P_1)) (r(P_2)) (r(P_1)-r(P_2)) (r(z))
}
*
postclose qdata
* diff of means
* For every item: blank out the codes 888888, 988888, 999999 and 99 in the
* item itself, rescale it to 0-100 using the minimum and maximum observed
* in the data in memory, and run an unequal-variance t test by
* clean_or_compr. Group 1 results are posted as "clean", group 2 as "fraud".
local mean_items l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
	ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
	d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
	ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
	lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
	sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2

postfile qdata str32 question clean_mu clean_sd fraud_mu fraud_sd diff diff_se n1 n2 dof pval ///
	using "questions_full_mean.dta", replace
foreach item of varlist `mean_items' {
	display "`item'"
	* this overwrites the item in place; later loops see the cleaned values
	replace `item' = . if inlist(`item', 888888, 988888, 999999, 99)
	summarize `item'
	generate `item'_rs = (`item' - r(min))/(r(max) - r(min)) * 100
	ttest `item'_rs, by(clean_or_compr) unequal
	post qdata ("`item'") (r(mu_1)) (r(sd_1)) (r(mu_2)) (r(sd_2)) (r(mu_1)-r(mu_2)) (r(se)) (r(N_1)) (r(N_2)) (r(df_t)) (r(p))
}
*
postclose qdata
* diff of sd
* Variance-ratio test by clean_or_compr on the 0-100 rescaled version of
* every item (the _rs variables generated in the diff-of-means loop);
* posts both group standard deviations, the F statistic and its p-value.
local sd_items l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
	ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
	d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
	ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
	lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
	sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2

postfile qdata str32 question sd_clean sd_fraud diff_sd_fstat diff_sd_pval ///
	using "questions_full_sd.dta", replace
foreach item of varlist `sd_items' {
	display "`item'"
	sdtest `item'_rs, by(clean_or_compr)
	post qdata ("`item'") (r(sd_1)) (r(sd_2)) (r(F)) (r(p))
}
*
postclose qdata
* scale of items
* Record the largest observed value of every item. By this point the
* diff-of-means loop has already set the codes 888888, 988888, 999999 and 99
* to missing, so the maximum is taken over the remaining values only.
local scale_items l1 venl2 venl3 b1 b2 b3 b4 b6 b43 b11 b12 b13 b18 b21 b21a b32 b37 b47a venb11 venb10 venb51 venvb18 venvb19 venvb20 pr3dn pr3en polz1 ///
	ros1 ros4 ing4 eff1 eff2 vengrp15 vengrp16 aoj22new media3 media4 media4b media1 media2 media2b dst1b env1c e5 e15 e3 e16 ///
	d1 d2 d3 d4 d5 d6 vengrp5 vengrp6 vengrp7 vengrp8 vengrp9 vengrp10 vengrp11 vengrp12 vengrp13 vengrp14 venps1 venps2 venps3 venct1 venct2 venct3 ///
	ls3 soct2 idio2 cp6 cp7 cp8 cp13 cp20 it1 aoj11 aoj12 polz1a m1 m2 drk1 env2b pn4 pn5 lib1 lib2b lib2c ///
	lib4 fear11 pol1 vengrp1 vengrp2 vengrp3 vengrp4 mil10a1 mil10e1 mil10un1 mil10un2 mil10a2 mil10e2 venesc2b venesc3 ///
	sd2new2 sd3new2 sd6new2 venprot10 venprot12 venprot11 mil10oas1 mil10oas2

postfile qdata str32 question scale using "questions_full_sc.dta", replace
foreach item of varlist `scale_items' {
	display "`item'"
	quietly summarize `item'
	post qdata ("`item'") (r(max))
}
*
postclose qdata

* Assemble one row per question: start from the mean-test results and
* attach the sd, non-response and scale results by question name.
use questions_full_mean.dta, clear
foreach part in sd nr sc {
	merge 1:1 question using questions_full_`part'.dta, nogenerate
}
save questions_full_differences.dta, replace

* the four intermediate files are no longer needed once combined
foreach part in sd mean nr sc {
	erase questions_full_`part'.dta
}

* Unadjusted significance flags (1 = significant, 0 = not) at the 5% and
* 10% levels for the mean test, the variance test and the non-response test.
* A missing p-value yields 0, because a missing value is never below the cutoff.
gen mean_comp_test = pval < .05
gen mean_comp_test90 = pval < .10
gen var_comp_test = diff_sd_pval < .05
gen var_comp_test90 = diff_sd_pval < .10
* Stata treats a missing value as larger than any number, so abs(.) > 1.96
* would be true. The !missing() guard keeps an undefined z statistic from
* being counted as significant; it is coded 0, in line with how missing
* p-values are handled above.
gen nr_diff_test = (abs(diff_nr_zstat) > 1.96) & !missing(diff_nr_zstat)
gen nr_diff_test90 = (abs(diff_nr_zstat) > 1.64) & !missing(diff_nr_zstat)
save questions_full_differences.dta, replace

** with bonferroni corrections
* NOTE(review): the earlier header also mentioned Sidak, but no
* Sidak-adjusted test is computed in this section.
use questions_full_differences.dta, clear
* bonferroni: the per-item level is the family-wise level divided by the
* number of items in the file (_N). A missing p-value yields 0.
gen mean_comp_test_bonf = pval < (.05/_N)
gen mean_comp_test_bonf90 = pval < (.1/_N)
gen var_comp_test_bonf = diff_sd_pval < (.05/_N)
gen var_comp_test_bonf90 = diff_sd_pval < (.1/_N)
* The critical z values are derived from _N rather than typed in, so they
* stay correct when the item list changes and match the two-sided form
* abs(invnormal(alpha/2)) used for the step-wise procedures further down.
* The !missing() guard is needed because abs(.) compares as larger than
* any number and would otherwise be flagged as significant.
gen nr_diff_test_bonf = (abs(diff_nr_zstat) > abs(invnormal((.05/_N)/2))) & !missing(diff_nr_zstat)
gen nr_diff_test_bonf90 = (abs(diff_nr_zstat) > abs(invnormal((.1/_N)/2))) & !missing(diff_nr_zstat)
* absolute difference in means expressed in units of the group-1 (clean) sd
gen magn_diff_sd = abs(diff) / clean_sd

* holm's step-down procedure (alpha = .10)
* Items are ordered from most to least significant; the item at rank k is
* compared with alpha / (_N - k + 1). An item is rejected only if it and
* every more significant item pass their own threshold, so once one item
* fails, all later items are set to 0. The second statement in each group
* enforces this: replace works through the rows in order, so the lagged
* value it reads has already been updated.
* Items with a missing statistic sort last and are coded 0.

* means
gsort pval
gen mean_pval_rank = _n
gen mean_test_holm_alpha = .10 / (_N - mean_pval_rank + 1)
gen mean_comp_test_holm = (pval <= mean_test_holm_alpha) & !missing(pval)
replace mean_comp_test_holm = 0 if _n > 1 & mean_comp_test_holm[_n-1] == 0

* variances: the comparison uses diff_sd_pval, the p-value of the variance test
gsort diff_sd_pval
gen var_pval_rank = _n
gen var_test_holm_alpha = .10 / (_N - var_pval_rank + 1)
gen var_comp_test_holm = (diff_sd_pval <= var_test_holm_alpha) & !missing(diff_sd_pval)
replace var_comp_test_holm = 0 if _n > 1 & var_comp_test_holm[_n-1] == 0

* non-response: same rule on the z scale, using two-sided critical values
gen abs_diff_nr_zstat = abs(diff_nr_zstat) 
gsort -abs_diff_nr_zstat
gen nr_z_rank = _n
gen nr_test_holm_alpha = abs(invnormal((.10 / (_N - nr_z_rank + 1))/2))
gen nr_comp_test_holm = (abs_diff_nr_zstat >= nr_test_holm_alpha) & !missing(abs_diff_nr_zstat)
replace nr_comp_test_holm = 0 if _n > 1 & nr_comp_test_holm[_n-1] == 0

* hochberg's step-up procedure (alpha = .10)
* Items are ordered from least to most significant; the item at reverse
* rank r (r = 1 for the largest p-value) is compared with alpha / r.
* As soon as one item passes its threshold, it and every more significant
* item are rejected. The second statement in each group carries the
* rejection down the sorted rows: replace works through the rows in order,
* so the lagged value it reads has already been updated.
* Items with a missing statistic are placed first (treated as least
* significant) and are coded 0.

* means
gsort -pval, mfirst
gen mean_pval_rank_r = _n
gen mean_test_hoch_alpha = .10 / mean_pval_rank_r
gen mean_comp_test_hoch = (pval <= mean_test_hoch_alpha) & !missing(pval)
replace mean_comp_test_hoch = 1 if _n > 1 & mean_comp_test_hoch[_n-1] == 1 & !missing(pval)

* variances: the comparison uses diff_sd_pval, the p-value of the variance test
gsort -diff_sd_pval, mfirst
gen var_pval_rank_r = _n
gen var_test_hoch_alpha = .10 / var_pval_rank_r
gen var_comp_test_hoch = (diff_sd_pval <= var_test_hoch_alpha) & !missing(diff_sd_pval)
replace var_comp_test_hoch = 1 if _n > 1 & var_comp_test_hoch[_n-1] == 1 & !missing(diff_sd_pval)

* non-response: same rule on the z scale, using two-sided critical values.
* An ascending sort would put missing values last, so a sort key of -1 is
* used to move rows with a missing z statistic to the front.
tempvar nr_sortkey
gen `nr_sortkey' = cond(missing(abs_diff_nr_zstat), -1, abs_diff_nr_zstat)
sort `nr_sortkey'
gen nr_z_rank_r = _n
gen nr_test_hoch_alpha = abs(invnormal((.10 / nr_z_rank_r)/2))
gen nr_comp_test_hoch = (abs_diff_nr_zstat >= nr_test_hoch_alpha) & !missing(abs_diff_nr_zstat)
replace nr_comp_test_hoch = 1 if _n > 1 & nr_comp_test_hoch[_n-1] == 1 & !missing(abs_diff_nr_zstat)


*** TABLE A11 RESULTS ***
* Each row of the table is reported as a range across the four ways of
* flagging significance at the 10% level: Bonferroni, Holm, Hochberg
* and unadjusted.

** Row 1 (Difference in means) range of values
tab1 mean_comp_test_bonf90 mean_comp_test_holm mean_comp_test_hoch mean_comp_test90

** Row 2 (Average magnitude) range of values
* mean absolute difference, in clean-group sd units, among flagged items
foreach flag in mean_comp_test90 mean_comp_test_bonf90 mean_comp_test_holm mean_comp_test_hoch {
	summarize magn_diff_sd if `flag' == 1
}

** Row 3 (Difference in variances) range of values
tab1 var_comp_test_bonf90 var_comp_test_holm var_comp_test_hoch var_comp_test90

** Row 4 (Item nonresponse) range of values
tab1 nr_diff_test_bonf90 nr_comp_test_holm nr_comp_test_hoch nr_diff_test90

